from gxl_ai_utils.utils import utils_file

# One-off script: copy every record's top-level "caption" into its "extra"
# dict, save the result as a new JSONL manifest, then hand that manifest to
# make_shards_common.

# Source manifest, rewritten manifest, and the directory passed on as shards_dir.
input_data_list_path = "/home/work_nfs23/asr_data/data/osum_chat/s2s_paralanguage/s2s_caption_explicit_single_label_367k/data.list"
output_data_list_path = "/home/work_nfs23/asr_data/data/osum_chat/s2s_paralanguage/s2s_caption_explicit_single_label_367k/data_new.list"
shards_dir = "/home/work_nfs23/asr_data/data/osum_chat/s2s_paralanguage/s2s_caption_explicit_single_label_367k"

data_list = utils_file.load_dict_list_from_jsonl(input_data_list_path)

# Records are updated in place; new_data_list holds the same objects in the
# same order as data_list.
new_data_list = []
for record in data_list:
    # Right-hand side is evaluated first, so a missing "caption" key is
    # reported before a missing "extra" key.
    record["extra"]["caption"] = record["caption"]
    new_data_list.append(record)

utils_file.write_dict_list_to_jsonl(new_data_list, output_data_list_path)

# Pass the first ten rewritten records to print_list for a quick visual check.
preview = new_data_list[:10]
utils_file.print_list(preview)

# Imported only at this point, i.e. after the new manifest has been written.
# NOTE(review): presumably builds shards under shards_dir from the JSONL
# manifest using 32 worker threads -- confirm against make_shard_common.
from make_shard_common import make_shards_common

make_shards_common(
    jsonl_file=output_data_list_path,
    shards_dir=shards_dir,
    num_threads=32,
)



